{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f60b73e6",
   "metadata": {},
   "source": [
    "# Chapter 7 - Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02f20686",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "017b03c5",
   "metadata": {},
   "source": [
    "## Simple Exact Match Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "647444d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "table = [\n",
    "    [0,'Michael','Shearer',1970],\n",
    "    [1,'Michael','Shearer', 1970],\n",
    "    [2,'Mike','Shearer', 1970],\n",
    "    [3,'Michael','Shearer',1971],\n",
    "    [4,'Michelle','Shearer',1971],\n",
    "    [5,'Mike','Sheare', 1971]]\n",
    "clmns = ['ID','Firstname','Lastname','Year']\n",
    "df_ms = pd.DataFrame(table, columns = clmns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04adf3e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ms['cluster'] = df_ms.groupby(['Firstname','Lastname']).ngroup()\n",
    "df_ms"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13566603",
   "metadata": {},
   "source": [
    "## Approximate Match Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73346683",
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "df_combs = pd.DataFrame(list(itertools.combinations(table,2)), columns=['A','B'])\n",
    "df_combs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8db2f263",
   "metadata": {},
   "outputs": [],
   "source": [
    "clmnsA = pd.MultiIndex.from_arrays([['A']*len(clmns), clmns])\n",
    "clmnsB = pd.MultiIndex.from_arrays([['B']*len(clmns), clmns])\n",
    "\n",
    "df_edges = pd.concat([pd.DataFrame(df_combs['A'].values.tolist(), columns = clmnsA),\n",
    "                     pd.DataFrame(df_combs['B'].values.tolist(), columns = clmnsB)], axis=1)\n",
    "df_edges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7377f460",
   "metadata": {},
   "outputs": [],
   "source": [
    "#%pip install jellyfish\n",
    "import jellyfish as jf\n",
    "\n",
    "def is_match(row):\n",
    "    firstname_match = jf.jaro_winkler_similarity(row['A']['Firstname'], row['B']['Firstname']) > 0.9\n",
    "    lastname_match = jf.jaro_winkler_similarity(row['A']['Lastname'], row['B']['Lastname']) > 0.9\n",
    "    return firstname_match and lastname_match\n",
    "\n",
    "df_edges['Match'] = df_edges.apply(is_match, axis=1)\n",
    "df_edges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c1d8ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "#%pip install networkx\n",
    "import networkx as nx\n",
    "\n",
    "G = nx.from_pandas_edgelist(df_edges[df_edges['Match']], source=('A','ID'), target=('B','ID'))\n",
    "list(nx.connected_components(G))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afbdfca5",
   "metadata": {},
   "source": [
    "# Sample Problem"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1ed38e1",
   "metadata": {},
   "source": [
    "## Step 1 - Data Acquisition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d47135d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload raw file from Chapter 5\n",
    "\n",
    "df_psc = pd.read_csv('psc_raw.csv',dtype={'data.name_elements.surname':'string',\n",
    "                                           'data.name_elements.forename':'string',\n",
    "                                           'data.name_elements.middle_name':'string',\n",
    "                                           'data.name_elements.title':'string',\n",
    "                                           'data.nationality':'string'})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7149f7ed",
   "metadata": {},
   "source": [
    "## Step 2 - Data Standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca9e4280",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_psc = df_psc.dropna(subset=['data.date_of_birth.year','data.date_of_birth.month'])\n",
    "df_psc['Year'] = df_psc['data.date_of_birth.year'].astype('int64')\n",
    "df_psc['Month'] = df_psc['data.date_of_birth.month'].astype('int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d19e1fdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_psc = df_psc.rename(columns=\n",
    "                   {\"data.name_elements.surname\" : \"Lastname\",\n",
    "                    \"data.name_elements.forename\" : \"Firstname\",\n",
    "                    \"data.name_elements.middle_name\" : \"Middlename\",\n",
    "                    \"data.name_elements.title\" : \"Title\",\n",
    "                    \"data.nationality\" : \"Nationality\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bc49ec5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_psc = df_psc[['Lastname','Middlename','Firstname','company_number','Year','Month','Title','Nationality']]\n",
    "df_psc['unique_id'] = df_psc.index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa8f9447",
   "metadata": {},
   "source": [
    "## Step 3 - Record Blocking and Attribute Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c99d2e55",
   "metadata": {},
   "outputs": [],
   "source": [
    "from splink.duckdb.linker import DuckDBLinker\n",
    "from splink.duckdb import comparison_library as cl\n",
    "\n",
    "settings = {\n",
    "    \"link_type\": \"dedupe_only\",\n",
    "    \"blocking_rules_to_generate_predictions\": [\n",
    "        \"l.Year = r.Year and l.Month = r.Month and l.Lastname = r.Lastname\"\n",
    "    ],\n",
    "    \"comparisons\": [\n",
    "        cl.jaro_winkler_at_thresholds(\"Firstname\", [0.9]),\n",
    "        cl.jaro_winkler_at_thresholds(\"Middlename\", [0.9]),\n",
    "        cl.exact_match(\"Lastname\"),\n",
    "        cl.exact_match(\"Title\"),\n",
    "        cl.exact_match(\"Nationality\"),\n",
    "        cl.exact_match(\"Month\"),\n",
    "        cl.exact_match(\"Year\", term_frequency_adjustments=True),\n",
    "    ],\n",
    "    \"retain_matching_columns\": True,\n",
    "    \"retain_intermediate_calculation_columns\": True,\n",
    "    \"max_iterations\": 10,\n",
    "    \"em_convergence\": 0.01,\n",
    "    \"additional_columns_to_retain\": [\"company_number\"],\n",
    "}\n",
    "linker = DuckDBLinker(df_psc, settings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbebff9b",
   "metadata": {},
   "source": [
    "### Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b44ddf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.profile_columns([\"Firstname\",\"Middlename\",\"Lastname\",\"Title\",\"Nationality\",\"Month\",\"Year\"], top_n=10, bottom_n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d81b34c4",
   "metadata": {},
   "source": [
    "### Test Blocking Rules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5812d6c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.count_num_comparisons_from_blocking_rule(\"l.Lastname = r.Lastname and l.Month = r.Month and l.Title = r.Title and l.Nationality = r.Nationality\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45796530",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.count_num_comparisons_from_blocking_rule(\"l.Firstname = r.Firstname and l.Year = r.Year and l.Middlename = r.Middlename\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "362d8bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.count_num_comparisons_from_blocking_rule(\"l.Lastname = r.Lastname and l.Middlename = r.Middlename\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d57e570",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.count_num_comparisons_from_blocking_rule(\"l.Firstname = r.Firstname and l.Month = r.Month and l.Year = r.Year and l.Title = r.Title and l.Nationality = r.Nationality\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40e9e012",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.estimate_u_using_random_sampling(max_pairs=1e7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bce55c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.estimate_parameters_using_expectation_maximisation(\"l.Lastname = r.Lastname and l.Middlename = r.Middlename\", fix_u_probabilities=False)\n",
    "linker.estimate_parameters_using_expectation_maximisation(\"l.Firstname = r.Firstname and l.Month = r.Month and l.Year = r.Year and l.Title = r.Title and l.Nationality = r.Nationality\", fix_u_probabilities=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a99f4b99",
   "metadata": {},
   "outputs": [],
   "source": [
    "#linker.save_model_to_json(\"Chapter7_Splink_Settings.json\", overwrite=True)\n",
    "linker.load_settings(\"Chapter7_Splink_Settings.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95c3400e",
   "metadata": {},
   "source": [
    "## Step 4 - Match Classification and Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3576b0d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_predict = linker.predict(threshold_match_probability=0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9e55385",
   "metadata": {},
   "outputs": [],
   "source": [
    "clusters = linker.cluster_pairwise_predictions_at_threshold(df_predict, threshold_match_probability=0.9)\n",
    "df_clusters = clusters.as_pandas_dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fe8fc54",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_clusters.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a0b6c62",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cgroup = df_clusters.groupby(['cluster_id'], sort=False)[['company_number','Firstname','Title','Nationality','Lastname']].agg(lambda x: list(set(x))).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3b6eaa0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cselect = df_cgroup[(df_cgroup['Firstname'].apply(len) > 1) &\n",
    "                  (df_cgroup['Title'].apply(len) > 1) & \n",
    "                  (df_cgroup['Nationality'].apply(len) > 1) &\n",
    "                  (df_cgroup['company_number'].apply(len) == 6)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60e77951",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cselect.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3a8b0a44",
   "metadata": {},
   "source": [
    "## Step 5 - Vizualisation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aed06a64",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "mybins =[1,2,10,100,1000,10000]\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "counts, bins, patches = ax.hist(df_cgroup['company_number'].apply(len), bins=mybins )\n",
    "\n",
    "bin_centers = 0.5 * np.diff(bins) + bins[:-1]\n",
    "for label, x in zip(['1','2-10','10-100','100-1000','1000+'], bin_centers):\n",
    "    ax.annotate(label, xy=(x, 0), xycoords=('data', 'axes fraction'),   xytext=(0,-10), textcoords='offset points',  va='top', ha='right')\n",
    "\n",
    "ax.tick_params(labelbottom=False)\n",
    "ax.xaxis.set_label_coords(0,-0.1)\n",
    "ax.xaxis.set_tick_params(which='minor', bottom=False)\n",
    "ax.set_xlabel('Number of controlled companies')\n",
    "ax.set_ylabel('Count')\n",
    "ax.set_title('Distribution of significant company control')\n",
    "\n",
    "ax.set_yscale('log')\n",
    "ax.set_xscale('log')\n",
    "\n",
    "fig.tight_layout()    \n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cadcc0c1",
   "metadata": {},
   "source": [
    "## Step6 - Cluster Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9ed73a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.cluster_studio_dashboard(df_predict, clusters, \"Chapter7_cluster_studio.html\", \n",
    "                                cluster_ids = df_cselect['cluster_id'].to_list(),\n",
    "                                overwrite=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a67439cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import IFrame\n",
    "\n",
    "IFrame(\n",
    "    src=\"Chapter7_cluster_studio.html\", width=\"100%\", height=1200\n",
    ")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5b81255",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
